Machine Learning Engineer Nanodegree

Capstone Project

📑   P6: Sberbank Russian Housing Market

In [1]:
%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Orbitron|Roboto');
body {background-color: honeydew;} 
a {color: #31c831; font-family: 'Roboto';} 
h1 {color: forestgreen; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #ccc;} 
h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #ccc;}
h4 {color: #31c831; font-family: 'Roboto';}
span {text-shadow: 4px 4px 4px #ccc;}
div.output_prompt, div.output_area pre {color: slategray;}
div.input_prompt, div.output_subarea {color: forestgreen;}      
div.output_stderr pre {background-color: ghostwhite;}  
div.output_stderr {background-color: slategrey;}                        
</style>
<script>
code_show = true; 
function code_display() {
    if (code_show) {
        $('div.input').each(function(id) {
            if (id == 0 || $(this).html().indexOf('hide_code') > -1) {$(this).hide();}
        });
        $('div.output_prompt').css('opacity', 0);
    } else {
        $('div.input').each(function(id) {$(this).show();});
        $('div.output_prompt').css('opacity', 1);
    };
    code_show = !code_show;
} 
$(document).ready(code_display);
</script>
<form action="javascript: code_display()">
<input style="color: forestgreen; background: honeydew; opacity: 0.8;" \ 
type="submit" value="Click to display or hide code cells">
</form>      
In [2]:
hide_code = ''
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import scipy

import seaborn as sns
import matplotlib.pylab as plt

from random import random
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML, SVG

from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import KFold, ParameterGrid, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.linear_model import HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

import keras as ks
from keras.models import Sequential, load_model, Model
from keras.optimizers import SGD, RMSprop
from keras.layers import Dense, Dropout, LSTM, GlobalAveragePooling1D
from keras.layers import Activation, Flatten, Input, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils.vis_utils import model_to_dot
Using TensorFlow backend.
In [3]:
hide_code
def regression(regressor, x_train, x_test, y_train):
    reg = regressor
    reg.fit(x_train, y_train)
    
    y_train_reg = reg.predict(x_train)
    y_test_reg = reg.predict(x_test)
    
    return y_train_reg, y_test_reg

def history_plot(fit_history):
    plt.figure(figsize=(18, 12))
    
    plt.subplot(211)
    plt.plot(fit_history.history['loss'], color='#348ABD', label = 'train')
    plt.plot(fit_history.history['val_loss'], color='#228B22', label = 'test')
    plt.legend()
    plt.title('Loss Function');  
    
    plt.subplot(212)
    plt.plot(fit_history.history['mean_absolute_error'], color='#348ABD', label = 'train')
    plt.plot(fit_history.history['val_mean_absolute_error'], color='#228B22', label = 'test')
    plt.legend()
    plt.title('Mean Absolute Error');   

def scores(regressor, y_train, y_test, y_train_reg, y_test_reg):
    separator1, separator2 = '<_>'*18, '-'*10
    print(separator1, '\n', regressor, '\n'+separator1)
    print("EV score. Train: ", explained_variance_score(y_train, y_train_reg))
    print("EV score. Test: ", explained_variance_score(y_test, y_test_reg))
    print(separator2)
    print("R2 score. Train: ", r2_score(y_train, y_train_reg))
    print("R2 score. Test: ", r2_score(y_test, y_test_reg))
    print(separator2)
    print("MSE score. Train: ", mean_squared_error(y_train, y_train_reg))
    print("MSE score. Test: ", mean_squared_error(y_test, y_test_reg))
    print(separator2)
    print("MAE score. Train: ", mean_absolute_error(y_train, y_train_reg))
    print("MAE score. Test: ", mean_absolute_error(y_test, y_test_reg))
    print(separator2)
    print("MdAE score. Train: ", median_absolute_error(y_train, y_train_reg))
    print("MdAE score. Test: ", median_absolute_error(y_test, y_test_reg))
    
def scores2(regressor, target, target_predict):
    separator1, separator2 = '<_>'*18, '-'*10
    print(separator1, '\n', regressor, '\n'+separator1)
    print("EV score:", explained_variance_score(target, target_predict))
    print(separator2)
    print("R2 score:", r2_score(target, target_predict))
    print(separator2)
    print("MSE score:", mean_squared_error(target, target_predict))
    print(separator2)
    print("MAE score:", mean_absolute_error(target, target_predict))
    print(separator2)
    print("MdAE score:", median_absolute_error(target, target_predict))

Capstone Proposal Overview

In this capstone project proposal, prior to completing the following Capstone Project, we will leverage what we've learned throughout the Nanodegree program to author a proposal for solving a problem of our choice by applying machine learning algorithms and techniques. A project proposal encompasses seven key points:

  • The project's domain background : the field of research where the project is derived;
  • A problem statement : a problem being investigated for which a solution will be defined;
  • The datasets and inputs : data or inputs being used for the problem;
  • A solution statement : a the solution proposed for the problem given;
  • A benchmark model : some simple or historical model or result to compare the defined solution to;
  • A set of evaluation metrics : functional representations for how the solution can be measured;
  • An outline of the project design : how the solution will be developed and results obtained.

Domain Background

Housing costs demand a significant investment from both consumers and developers. And when it comes to planning a budget—whether personal or corporate—the last thing anyone needs is uncertainty about one of their budgets expenses. Sberbank, Russia’s oldest and largest bank, helps their customers by making predictions about reality prices so renters, developers, and lenders are more confident when they sign a lease or purchase a building.

Although the housing market is relatively stable in Russia, the country’s volatile economy makes forecasting prices as a function of apartment characteristics a unique challenge. Complex interactions between housing features such as a number of bedrooms and location are enough to make pricing predictions complicated. Adding an unstable economy to the mix means Sberbank and their customers need more than simple regression models in their arsenal.

Problem Statement

Sberbank is challenging programmers to develop algorithms which use a broad spectrum of features to predict real prices. Competitors will rely on a rich dataset that includes housing data and macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to their customers in an uncertain economy.

Datasets and Inputs

Data Description (data_dictionary.txt)

In [4]:
hide_code
HTML('''<div id="data">
<p><iframe src="data_dictionary.txt" frameborder="3" height="300" width="99%"></iframe></p>
</div>''')
Out[4]:

Load and Display the Data

In [4]:
hide_code
macro = pd.read_csv('macro.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [5]:
hide_code
macro[100:110].T[1:15]
Out[5]:
100 101 102 103 104 105 106 107 108 109
oil_urals 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87
gdp_quart 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8
gdp_quart_growth 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1
cpi 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8
ppi 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2
gdp_deflator NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
balance_trade 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604
balance_trade_growth 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1
usdrub 29.1525 29.0261 29.1 28.9194 29.0239 29.092 29.092 29.092 29.1835 29.1398
eurrub 39.2564 39.4051 39.5008 39.5233 39.3691 39.2524 39.2524 39.2524 39.3214 39.1532
brent 84.83 84.77 84.72 86.15 87.17 85.99 85.99 85.99 84.23 84.8
net_capital_export NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
gdp_annual 38807.2 38807.2 38807.2 38807.2 38807.2 38807.2 38807.2 38807.2 38807.2 38807.2
gdp_annual_growth -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086 -0.0782086
In [6]:
hide_code
train[200:208].T[1:15]
Out[6]:
200 201 202 203 204 205 206 207
timestamp 2011-10-25 2011-10-25 2011-10-25 2011-10-25 2011-10-26 2011-10-26 2011-10-26 2011-10-26
full_sq 38 33 30 76 44 35 72 32
life_sq 19 14 18 51 29 21 45 18
floor 15 8 3 2 8 5 10 6
max_floor NaN NaN NaN NaN NaN NaN NaN NaN
material NaN NaN NaN NaN NaN NaN NaN NaN
build_year NaN NaN NaN NaN NaN NaN NaN NaN
num_room NaN NaN NaN NaN NaN NaN NaN NaN
kitch_sq NaN NaN NaN NaN NaN NaN NaN NaN
state NaN NaN NaN NaN NaN NaN NaN NaN
product_type Investment Investment Investment Investment Investment Investment Investment Investment
sub_area Horoshevskoe Juzhnoe Butovo Marfino Juzhnoportovoe Vostochnoe Izmajlovo Lefortovo Krylatskoe Chertanovo Juzhnoe
area_m 8.56843e+06 2.61551e+07 2.1044e+06 4.57959e+06 3.8e+06 8.99364e+06 1.21645e+07 9.28244e+06
raion_popul 56535 178264 26943 71715 76308 89971 78507 143661

Solution Statement

Selection of Features

In [7]:
hide_code
X_list_num = ['timestamp',
              'full_sq', 'num_room', 'area_m', 
              'kremlin_km', 'big_road2_km', 'big_road1_km',
              'workplaces_km',
              'stadium_km', 'swim_pool_km', 'fitness_km', 
              'detention_facility_km', 'cemetery_km',
              'radiation_km', 'oil_chemistry_km',
              'theater_km', 'exhibition_km', 'museum_km', 
              'park_km', 'public_healthcare_km',  
              'metro_min_walk','metro_km_avto', 
              'bus_terminal_avto_km', 'public_transport_station_min_walk',
              'railroad_station_walk_min', 'railroad_station_avto_km',
              'kindergarten_km', 'school_km', 'preschool_km',
              'university_km', 'additional_education_km',
              'shopping_centers_km', 'big_market_km',
              'ekder_all', 'work_all', 'young_all']

X_list_cat = ['sub_area', 'ID_metro', 
              'office_raion', 'sport_objects_raion',
              'raion_popul', 'healthcare_centers_raion',
              'school_education_centers_raion', 
              'preschool_education_centers_raion']

target_train = train['price_doc']
In [8]:
hide_code
plt.style.use('seaborn-whitegrid')
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))

sns.distplot(target_train, bins=200, color='#228B22', ax=ax1)
ax1.set_xlabel("Prices")

sns.distplot(np.log(target_train), bins=200, color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")

plt.suptitle('Sberbank Russian Housing Data');
In [9]:
hide_code
print ("Sberbank Russian Housing Dataset Statistics: \n")
print ("Number of houses = ", len(target_train))
print ("Number of features = ", len(list(train[X_list_num+X_list_cat].keys())))
print ("Minimum house price = ", np.min(target_train))
print ("Maximum house price = ", np.max(target_train))
print ("Mean house price = ", "%.2f" % np.mean(target_train))
print ("Median house price = ", "%.2f" % np.median(target_train))
print ("Standard deviation of house prices =", "%.2f" % np.std(target_train))
Sberbank Russian Housing Dataset Statistics: 

Number of houses =  30471
Number of features =  44
Minimum house price =  100000
Maximum house price =  111111112
Mean house price =  7123035.28
Median house price =  6274411.00
Standard deviation of house prices = 4780032.89

Fill in Missing Values

In [10]:
hide_code
train[X_list_num].isnull().sum()
Out[10]:
timestamp                               0
full_sq                                 0
num_room                             9572
area_m                                  0
kremlin_km                              0
big_road2_km                            0
big_road1_km                            0
workplaces_km                           0
stadium_km                              0
swim_pool_km                            0
fitness_km                              0
detention_facility_km                   0
cemetery_km                             0
radiation_km                            0
oil_chemistry_km                        0
theater_km                              0
exhibition_km                           0
museum_km                               0
park_km                                 0
public_healthcare_km                    0
metro_min_walk                         25
metro_km_avto                           0
bus_terminal_avto_km                    0
public_transport_station_min_walk       0
railroad_station_walk_min              25
railroad_station_avto_km                0
kindergarten_km                         0
school_km                               0
preschool_km                            0
university_km                           0
additional_education_km                 0
shopping_centers_km                     0
big_market_km                           0
ekder_all                               0
work_all                                0
young_all                               0
dtype: int64
In [11]:
hide_code
test[X_list_num].isnull().sum()
Out[11]:
timestamp                             0
full_sq                               0
num_room                              0
area_m                                0
kremlin_km                            0
big_road2_km                          0
big_road1_km                          0
workplaces_km                         0
stadium_km                            0
swim_pool_km                          0
fitness_km                            0
detention_facility_km                 0
cemetery_km                           0
radiation_km                          0
oil_chemistry_km                      0
theater_km                            0
exhibition_km                         0
museum_km                             0
park_km                               0
public_healthcare_km                  0
metro_min_walk                       34
metro_km_avto                         0
bus_terminal_avto_km                  0
public_transport_station_min_walk     0
railroad_station_walk_min            34
railroad_station_avto_km              0
kindergarten_km                       0
school_km                             0
preschool_km                          0
university_km                         0
additional_education_km               0
shopping_centers_km                   0
big_market_km                         0
ekder_all                             0
work_all                              0
young_all                             0
dtype: int64
In [12]:
hide_code
df_train = pd.DataFrame(train, columns=X_list_num)
df_train_cat = pd.DataFrame(train, columns=X_list_num+X_list_cat)

df_test = pd.DataFrame(test, columns=X_list_num)
df_test_cat = pd.DataFrame(test, columns=X_list_num+X_list_cat)

df_train['prices'] = target_train
df_train_cat['prices'] = target_train

df_train = df_train.dropna(subset=['num_room'])
df_train_cat = df_train_cat.dropna(subset=['num_room'])

df_train['metro_min_walk'] = \
df_train['metro_min_walk'].interpolate(method='linear')
df_train_cat['metro_min_walk'] = \
df_train_cat['metro_min_walk'].interpolate(method='linear')

df_train['railroad_station_walk_min'] = \
df_train['railroad_station_walk_min'].interpolate(method='linear')
df_train_cat['railroad_station_walk_min'] = \
df_train_cat['railroad_station_walk_min'].interpolate(method='linear')

df_test['metro_min_walk'] = \
df_test['metro_min_walk'].interpolate(method='linear')
df_test_cat['metro_min_walk'] = \
df_test_cat['metro_min_walk'].interpolate(method='linear')

df_test['railroad_station_walk_min'] = \
df_test['railroad_station_walk_min'].interpolate(method='linear')
df_test_cat['railroad_station_walk_min'] = \
df_test_cat['railroad_station_walk_min'].interpolate(method='linear')

len(df_train)
Out[12]:
20899

Categorical and Macro Features

Add the Macro Feature

In [13]:
hide_code
usdrub_pairs = dict(zip(list(macro['timestamp']), list(macro['usdrub'])))
# salary_pairs = dict(zip(list(macro['timestamp']), list(macro['salary'])))

df_train['timestamp'].replace(usdrub_pairs,inplace=True)
df_train_cat['timestamp'].replace(usdrub_pairs,inplace=True)

df_test['timestamp'].replace(usdrub_pairs,inplace=True)
df_test_cat['timestamp'].replace(usdrub_pairs,inplace=True)

df_train.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_train_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)

df_test.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)

Preprocess Categorical Features

In [14]:
hide_code
separator = '<_>'*38
for df in [df_train_cat, df_test_cat]:
    print ('\n', separator)
    
    print('\nsub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))

    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))

    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))

    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df_train_cat['sport_objects_raion']))

    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))

    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))

    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))

    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
 <_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>

sub area
Number of categories: 146
{'Vojkovskoe', 'Severnoe Izmajlovo', 'Kuncevo', 'Sokolinaja Gora', 'Butyrskoe', 'Pechatniki', 'Ramenki', 'Juzhnoe Medvedkovo', 'Poselenie Kokoshkino', 'Dmitrovskoe', 'Rostokino', 'Mozhajskoe', 'Troparevo-Nikulino', 'Severnoe Butovo', 'Mitino', 'Ostankinskoe', 'Solncevo', 'Teplyj Stan', 'Poselenie Moskovskij', "Sokol'niki", 'Poselenie Marushkinskoe', 'Ochakovo-Matveevskoe', 'Losinoostrovskoe', 'Matushkino', 'Ivanovskoe', 'Metrogorodok', 'Poselenie Desjonovskoe', 'Severnoe Tushino', 'Hovrino', "Mar'ino", 'Severnoe Medvedkovo', 'Brateevo', 'Chertanovo Severnoe', 'Izmajlovo', 'Chertanovo Juzhnoe', 'Poselenie Shherbinka', 'Poselenie Rogovskoe', 'Strogino', 'Lefortovo', 'Danilovskoe', 'Basmannoe', 'Poselenie Kievskij', 'Krjukovo', 'Jaroslavskoe', "Zamoskvorech'e", 'Meshhanskoe', 'Begovoe', "Kon'kovo", 'Beskudnikovskoe', 'Babushkinskoe', 'Zjuzino', 'Poselenie Sosenskoe', "Gol'janovo", 'Zapadnoe Degunino', 'Lianozovo', 'Jasenevo', 'Vostochnoe Degunino', 'Birjulevo Zapadnoe', 'Poselenie Filimonkovskoe', 'Akademicheskoe', 'Vnukovo', 'Golovinskoe', 'Sviblovo', 'Poselenie Vnukovskoe', 'Marfino', 'Timirjazevskoe', 'Novogireevo', 'Ljublino', 'Poselenie Krasnopahorskoe', 'Poselenie Shhapovskoe', 'Poselenie Rjazanovskoe', 'Kurkino', 'Veshnjaki', 'Nizhegorodskoe', 'Nekrasovka', 'Alekseevskoe', 'Nagatinskij Zaton', 'Poselenie Voskresenskoe', "Chertanovo Central'noe", 'Tverskoe', 'Vostochnoe Izmajlovo', 'Presnenskoe', 'Hamovniki', "Moskvorech'e-Saburovo", 'Severnoe', 'Kosino-Uhtomskoe', 'Troickij okrug', 'Juzhnoe Tushino', 'Cheremushki', 'Novokosino', 'Horoshevskoe', 'Orehovo-Borisovo Severnoe', 'Preobrazhenskoe', 'Poselenie Mosrentgen', 'Koptevo', 'Levoberezhnoe', 'Perovo', 'Vyhino-Zhulebino', 'Filevskij Park', 'Nagornoe', 'Birjulevo Vostochnoe', 'Pokrovskoe Streshnevo', "Krasnosel'skoe", 'Arbat', 'Molzhaninovskoe', 'Obruchevskoe', 'Poselenie Pervomajskoe', 'Poselenie Mihajlovo-Jarcevskoe', 'Gagarinskoe', 'Poselenie Novofedorovskoe', 'Sokol', 'Otradnoe', 'Vostochnoe', 'Savelki', 'Zjablikovo', 'Donskoe', 'Shhukino', 'Taganskoe', "Mar'ina Roshha", 'Prospekt Vernadskogo', 'Bogorodskoe', 'Bibirevo', "Tekstil'shhiki", 'Kapotnja', 'Poselenie Klenovskoe', 'Orehovo-Borisovo Juzhnoe', 'Juzhnoe Butovo', 'Horoshevo-Mnevniki', "Altuf'evskoe", 'Nagatino-Sadovniki', 'Staroe Krjukovo', 'Lomonosovskoe', 'Krylatskoe', 'Caricyno', 'Ajeroport', 'Kotlovka', 'Fili Davydkovo', 'Juzhnoportovoe', "Kuz'minki", 'Savelovskoe', 'Dorogomilovo', 'Rjazanskij', 'Novo-Peredelkino', 'Jakimanka', 'Silino', 'Poselenie Voronovskoe'}

ID metro
Number of categories: 219
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 141, 14, 16, 19, 20, 23, 24, 27, 37, 39, 45, 48, 56, 59, 73, 84, 87, 93}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 29}

raion popul
Number of categories: 146
{90114, 116742, 6161, 28179, 76308, 80917, 68630, 53786, 94236, 41504, 71715, 8227, 12327, 83502, 21040, 77878, 139322, 118843, 81980, 57405, 13890, 78418, 178264, 85083, 112221, 101982, 4199, 5740, 61039, 75377, 123000, 37502, 142462, 67710, 2693, 108171, 57995, 47245, 57999, 115352, 153248, 87713, 118945, 21155, 112804, 145576, 78507, 247469, 7341, 130229, 125111, 129207, 38075, 102590, 105663, 96959, 145088, 86206, 8384, 56535, 79576, 85721, 102618, 156377, 85219, 113897, 174831, 132349, 51455, 111874, 111374, 57107, 43795, 78616, 12061, 155427, 55590, 178473, 143661, 73007, 48439, 36154, 21819, 64317, 26943, 103746, 102726, 32071, 101708, 9553, 157010, 17236, 55125, 4949, 27992, 130396, 165727, 94561, 94564, 7538, 89971, 28537, 89467, 76156, 17790, 76670, 2942, 83844, 123280, 166803, 80791, 60315, 175518, 4001, 142243, 64931, 83369, 125354, 102828, 37807, 111023, 155572, 65972, 73148, 31167, 39873, 3521, 72131, 85956, 106445, 7122, 26578, 61396, 219609, 78810, 104410, 91100, 81887, 19940, 100846, 122862, 32241, 104434, 2546, 122873, 76284}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13}

 <_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>

sub area
Number of categories: 145
{'Vojkovskoe', 'Severnoe Izmajlovo', 'Kuncevo', 'Sokolinaja Gora', 'Butyrskoe', 'Pechatniki', 'Ramenki', 'Juzhnoe Medvedkovo', 'Poselenie Kokoshkino', 'Dmitrovskoe', 'Mozhajskoe', 'Rostokino', 'Troparevo-Nikulino', 'Mitino', 'Ostankinskoe', 'Teplyj Stan', 'Solncevo', 'Severnoe Butovo', 'Poselenie Moskovskij', "Sokol'niki", 'Poselenie Marushkinskoe', 'Ochakovo-Matveevskoe', 'Losinoostrovskoe', 'Matushkino', 'Ivanovskoe', 'Metrogorodok', 'Poselenie Desjonovskoe', 'Severnoe Tushino', 'Hovrino', "Mar'ino", 'Severnoe Medvedkovo', 'Brateevo', 'Izmajlovo', 'Chertanovo Severnoe', 'Poselenie Shherbinka', 'Chertanovo Juzhnoe', 'Poselenie Rogovskoe', 'Strogino', 'Lefortovo', 'Danilovskoe', 'Basmannoe', 'Poselenie Kievskij', 'Krjukovo', 'Jaroslavskoe', "Zamoskvorech'e", 'Meshhanskoe', 'Begovoe', "Kon'kovo", 'Beskudnikovskoe', 'Babushkinskoe', 'Zjuzino', 'Poselenie Sosenskoe', 'Zapadnoe Degunino', "Gol'janovo", 'Jasenevo', 'Lianozovo', 'Vostochnoe Degunino', 'Birjulevo Zapadnoe', 'Poselenie Filimonkovskoe', 'Akademicheskoe', 'Vnukovo', 'Timirjazevskoe', 'Sviblovo', 'Poselenie Vnukovskoe', 'Ljublino', 'Marfino', 'Golovinskoe', 'Novogireevo', 'Poselenie Shhapovskoe', 'Poselenie Krasnopahorskoe', 'Poselenie Rjazanovskoe', 'Kurkino', 'Veshnjaki', 'Nizhegorodskoe', 'Nekrasovka', 'Alekseevskoe', 'Nagatinskij Zaton', 'Poselenie Voskresenskoe', "Chertanovo Central'noe", 'Tverskoe', 'Vostochnoe Izmajlovo', 'Presnenskoe', 'Hamovniki', "Moskvorech'e-Saburovo", 'Severnoe', 'Kosino-Uhtomskoe', 'Juzhnoe Tushino', 'Troickij okrug', 'Cheremushki', 'Horoshevskoe', 'Novokosino', 'Orehovo-Borisovo Severnoe', 'Preobrazhenskoe', 'Poselenie Mosrentgen', 'Koptevo', 'Levoberezhnoe', 'Perovo', 'Vyhino-Zhulebino', 'Filevskij Park', 'Nagornoe', 'Birjulevo Vostochnoe', 'Pokrovskoe Streshnevo', "Krasnosel'skoe", 'Arbat', 'Molzhaninovskoe', 'Obruchevskoe', 'Poselenie Pervomajskoe', 'Poselenie Mihajlovo-Jarcevskoe', 'Poselenie Novofedorovskoe', 'Gagarinskoe', 'Sokol', 'Otradnoe', 'Vostochnoe', 'Savelki', 'Zjablikovo', 'Donskoe', 'Shhukino', 'Taganskoe', "Mar'ina Roshha", 'Prospekt Vernadskogo', 'Bogorodskoe', 'Bibirevo', 'Kapotnja', "Tekstil'shhiki", 'Orehovo-Borisovo Juzhnoe', 'Juzhnoe Butovo', "Altuf'evskoe", 'Horoshevo-Mnevniki', 'Nagatino-Sadovniki', 'Staroe Krjukovo', 'Lomonosovskoe', 'Krylatskoe', 'Caricyno', 'Ajeroport', 'Kotlovka', 'Fili Davydkovo', 'Savelovskoe', 'Dorogomilovo', "Kuz'minki", 'Juzhnoportovoe', 'Rjazanskij', 'Novo-Peredelkino', 'Jakimanka', 'Silino', 'Poselenie Voronovskoe'}

ID metro
Number of categories: 212
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 199, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 219, 220, 221, 222, 224}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 141, 14, 16, 19, 20, 23, 24, 27, 37, 39, 45, 48, 56, 59, 73, 84, 87, 93}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 29}

raion popul
Number of categories: 145
{90114, 116742, 6161, 28179, 76308, 80917, 68630, 53786, 94236, 41504, 71715, 8227, 12327, 83502, 21040, 77878, 139322, 118843, 81980, 57405, 13890, 78418, 178264, 85083, 112221, 101982, 4199, 5740, 61039, 75377, 123000, 37502, 67710, 142462, 2693, 57995, 108171, 47245, 57999, 115352, 153248, 118945, 87713, 21155, 112804, 145576, 78507, 247469, 7341, 130229, 129207, 125111, 38075, 86206, 96959, 145088, 102590, 105663, 8384, 56535, 79576, 156377, 102618, 85721, 85219, 113897, 174831, 132349, 51455, 111874, 111374, 57107, 43795, 78616, 12061, 155427, 55590, 178473, 143661, 73007, 48439, 36154, 21819, 64317, 26943, 103746, 102726, 32071, 101708, 9553, 157010, 17236, 55125, 4949, 27992, 130396, 165727, 94561, 94564, 7538, 89971, 28537, 89467, 76156, 76670, 17790, 83844, 123280, 166803, 80791, 60315, 175518, 4001, 64931, 142243, 83369, 125354, 102828, 111023, 37807, 155572, 65972, 73148, 31167, 39873, 3521, 72131, 85956, 106445, 7122, 26578, 61396, 219609, 78810, 104410, 91100, 81887, 19940, 122862, 100846, 32241, 2546, 104434, 122873, 76284}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13}
In [15]:
hide_code
for feature in X_list_cat:
    for element in list(set(df_test_cat[feature])):
        if element not in list(set(df_train_cat[feature])): 
            print (feature, element)
ID_metro 224
In [16]:
hide_code
ID_metro_cat = pd.factorize(df_train_cat['ID_metro'])
df_train_cat['ID_metro'] = ID_metro_cat[0]

ID_metro_pairs = dict(zip(list(ID_metro_cat[1]), list(set(ID_metro_cat[0]))))
ID_metro_pairs[224] = 219

df_test_cat['ID_metro'].replace(ID_metro_pairs,inplace=True)
In [17]:
hide_code
for feature in X_list_cat:
    if feature !='ID_metro':
        feature_cat = pd.factorize(df_train_cat[feature])
        df_train_cat[feature] = feature_cat[0]
        feature_pairs = dict(zip(list(feature_cat[1]), list(set(feature_cat[0]))))
        df_test_cat[feature].replace(feature_pairs,inplace=True)
In [18]:
hide_code
for df in [df_train_cat, df_test_cat]:
    print ('\n', separator)
    
    print('\nsub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))

    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))

    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))

    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df_train_cat['sport_objects_raion']))

    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))

    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))

    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))

    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
 <_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>

sub area
Number of categories: 146
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145}

ID metro
Number of categories: 219
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}

raion popul
Number of categories: 146
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

 <_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>

sub area
Number of categories: 145
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145}

ID metro
Number of categories: 212
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 212, 213, 215, 218, 219}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}

raion popul
Number of categories: 145
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
In [19]:
hide_code
df_train_cat1 = df_train_cat
encode = OneHotEncoder(sparse=False)

for column in X_list_cat:
    encode.fit(df_train_cat[[column]])
    transform = encode.transform(df_train_cat[[column]])
    
    transform = pd.DataFrame(transform, 
                             columns=[(column+"_"+str(i)) for i in df_train_cat[column].value_counts().index])
    transform = transform.set_index(df_train_cat.index.values)
    
    df_train_cat1 = pd.concat([df_train_cat1, transform], axis=1)
    df_train_cat1 = df_train_cat1.drop(column, 1)
In [20]:
hide_code
df_test_cat1 = df_test_cat
encode = OneHotEncoder(sparse=False)

for column in X_list_cat:
    encode.fit(df_test_cat[[column]])
    transform = encode.transform(df_test_cat[[column]])
    
    transform = pd.DataFrame(transform, 
                             columns=[(column+"_"+str(i)) for i in df_test_cat[column].value_counts().index])
    transform = transform.set_index(df_test_cat.index.values)
    
    df_test_cat1 = pd.concat([df_test_cat1, transform], axis=1)
    df_test_cat1 = df_test_cat1.drop(column, 1)

Check Encoding

In [21]:
hide_code
df_train_cat1.iloc[:, 623:636][:3].as_matrix()
Out[21]:
array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])
In [22]:
hide_code
df_train_cat['preschool_education_centers_raion'][:3]
Out[22]:
7672    0
8056    1
8111    2
Name: preschool_education_centers_raion, dtype: int64

Add Missing Columns with Zero Values

In [23]:
hide_code
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
Shape of the train data frame: (20899, 636)
Shape of the test data frame: (7662, 626)
In [24]:
hide_code
print("Features in the train data, but not in the test data:")
for element in list(df_train_cat1):
    if element not in list(df_test_cat1):
        print(element)
Features in the train data, but not in the test data:
prices
sub_area_136
ID_metro_188
ID_metro_205
ID_metro_216
ID_metro_214
ID_metro_183
ID_metro_179
ID_metro_153
ID_metro_217
raion_popul_136
In [25]:
hide_code
print("Features in the test data, but not in the train data:")
for element in list(df_test_cat1):
    if element not in list(df_train_cat1):
        print(element)
Features in the test data, but not in the train data:
ID_metro_219
In [26]:
hide_code
for column in ['sub_area_136',' ID_metro_188', 'ID_metro_205', 'ID_metro_216', 'ID_metro_214',
              'ID_metro_183',' ID_metro_179', 'ID_metro_153', 'ID_metro_217', 'raion_popul_136']:
    df_test_cat1[column] = 0
    
df_train_cat1['ID_metro_219'] = 0

print('Columns with zero values were added.\n')
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
Columns with zero values were added.

Shape of the train data frame: (20899, 637)
Shape of the test data frame: (7662, 636)

Display Correlation

In [27]:
hide_code
pearson = df_train.corr(method='pearson')
corr_with_prices = pearson.ix[-1][:-1]
corr_with_prices[abs(corr_with_prices).argsort()[::-1]]
Out[27]:
full_sq                              0.593829
num_room                             0.476337
kremlin_km                          -0.290126
stadium_km                          -0.238431
detention_facility_km               -0.233395
university_km                       -0.222964
theater_km                          -0.222873
workplaces_km                       -0.220889
swim_pool_km                        -0.220480
exhibition_km                       -0.212144
radiation_km                        -0.208256
museum_km                           -0.203846
park_km                             -0.201636
metro_min_walk                      -0.200058
fitness_km                          -0.197702
metro_km_avto                       -0.194751
shopping_centers_km                 -0.182459
public_healthcare_km                -0.182388
big_road2_km                        -0.178865
bus_terminal_avto_km                -0.176601
ekder_all                            0.169331
area_m                              -0.167851
school_km                           -0.158775
preschool_km                        -0.157079
additional_education_km             -0.146074
kindergarten_km                     -0.141627
work_all                             0.136761
railroad_station_walk_min           -0.135099
oil_chemistry_km                    -0.134873
railroad_station_avto_km            -0.132209
young_all                            0.131324
public_transport_station_min_walk   -0.128647
big_road1_km                        -0.098968
usdrub                               0.069506
big_market_km                       -0.069257
cemetery_km                         -0.042413
Name: prices, dtype: float64
In [28]:
hide_code
features_list2 = corr_with_prices[abs(corr_with_prices).argsort()[::-1]][:32].index.values.tolist()
print('The most correlated with prices:\n', features_list2)
The most correlated with prices:
 ['full_sq', 'num_room', 'kremlin_km', 'stadium_km', 'detention_facility_km', 'university_km', 'theater_km', 'workplaces_km', 'swim_pool_km', 'exhibition_km', 'radiation_km', 'museum_km', 'park_km', 'metro_min_walk', 'fitness_km', 'metro_km_avto', 'shopping_centers_km', 'public_healthcare_km', 'big_road2_km', 'bus_terminal_avto_km', 'ekder_all', 'area_m', 'school_km', 'preschool_km', 'additional_education_km', 'kindergarten_km', 'work_all', 'railroad_station_walk_min', 'oil_chemistry_km', 'railroad_station_avto_km', 'young_all', 'public_transport_station_min_walk']

Scale, Shuffle and Split the Data

In [29]:
hide_code
target_train = df_train['prices'].as_matrix()

features_train = df_train.drop('prices', 1).as_matrix()
features_test = df_test.as_matrix()

features_train_cat = df_train_cat.drop('prices', 1).as_matrix()
features_test_cat = df_test_cat.as_matrix()

features_train_cat_enc = df_train_cat1.drop('prices', 1).as_matrix()
features_test_cat_enc = df_test_cat1.as_matrix()
In [30]:
hide_code
print(separator, '\n\nNumeric Features')
X_train, X_test, y_train, y_test = \
train_test_split(features_train, target_train, test_size = 0.2, random_state = 1)
X_train.shape, X_test.shape
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric Features
Out[30]:
((16719, 36), (4180, 36))
In [31]:
hide_code
print(separator, '\n\nNumeric and Categorical Features')
X_train_cat, X_test_cat, y_train_cat, y_test_cat = \
train_test_split(features_train_cat, target_train, test_size = 0.2, random_state = 1)
X_train_cat.shape, X_test_cat.shape
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric and Categorical Features
Out[31]:
((16719, 44), (4180, 44))
In [32]:
hide_code
print(separator, '\n\nNumeric and Encoded Categorical Features')
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc, y_test_cat_enc = \
train_test_split(features_train_cat_enc, target_train, test_size = 0.2, random_state = 1)
X_train_cat_enc.shape, X_test_cat_enc.shape
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric and Encoded Categorical Features
Out[32]:
((16719, 636), (4180, 636))
In [33]:
hide_code
scale_X = RobustScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)

scale_y = RobustScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))
y_test = scale_y.transform(y_test.reshape(-1,1))

scale_X_cat = RobustScaler()
X_train_cat = scale_X_cat.fit_transform(X_train_cat)
X_test_cat = scale_X_cat.transform(X_test_cat)

scale_y_cat = RobustScaler()
y_train_cat = scale_y_cat.fit_transform(y_train_cat.reshape(-1,1))
y_test_cat = scale_y_cat.transform(y_test_cat.reshape(-1,1))

scale_X_cat_enc = RobustScaler()
X_train_cat_enc = scale_X_cat_enc.fit_transform(X_train_cat_enc)
X_test_cat_enc = scale_X_cat_enc.transform(X_test_cat_enc)

scale_y_cat_enc = RobustScaler()
y_train_cat_enc = scale_y_cat_enc.fit_transform(y_train_cat_enc.reshape(-1,1))
y_test_cat_enc = scale_y_cat_enc.transform(y_test_cat_enc.reshape(-1,1))

Benchmark Models

Regressors; Scikit-Learn

Tuning Parameters

In [37]:
hide_code
print(separator, '\n\nNumeric Features', '\nGradient Boosting Regressor')
param_grid_gbr = {'max_depth': [3, 4, 5], 'n_estimators': range(36, 361, 36)}
gridsearch_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr, n_jobs=5)\
                             .fit(X_train, y_train)
gridsearch_gbr.best_params_
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric Features 
Gradient Boosting Regressor
Out[37]:
{'max_depth': 4, 'n_estimators': 360}
In [82]:
hide_code
print ('Bagging Regressor')
param_grid_br = {'n_estimators': range(36, 361, 36)}
gridsearch_br = GridSearchCV(BaggingRegressor(), param_grid_br, n_jobs=5)\
                            .fit(X_train, y_train)
gridsearch_br.best_params_
Bagging Regressor
Out[82]:
{'n_estimators': 360}
In [35]:
hide_code
print(separator, '\n\nNumeric and Categorical Features', '\nGradient Boosting Regressor')
param_grid_gbr_cat = {'max_depth': [3, 4, 5], 'n_estimators': range(44, 441, 44)}
gridsearch_gbr_cat = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr_cat, n_jobs=5)\
                                 .fit(X_train_cat, y_train_cat)
gridsearch_gbr_cat.best_params_
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric and Categorical Features 
Gradient Boosting Regressor
Out[35]:
{'max_depth': 3, 'n_estimators': 396}
In [36]:
hide_code
print ('Bagging Regressor')
param_grid_br_cat = {'n_estimators': range(44, 441, 44)}
gridsearch_br_cat = GridSearchCV(BaggingRegressor(), param_grid_br_cat, n_jobs=5)\
                                .fit(X_train_cat, y_train_cat)
gridsearch_br_cat.best_params_
Bagging Regressor
Out[36]:
{'n_estimators': 308}
In [40]:
hide_code
print(separator, '\n\nNumeric and Encoded Categorical Features', '\nGradient Boosting Regressor')
param_grid_gbr_cat_enc = {'max_depth': [3, 4, 5], 'n_estimators': [159, 318, 636]}
gridsearch_gbr_cat_enc = GridSearchCV(GradientBoostingRegressor(), param_grid_gbr_cat_enc, n_jobs=5)\
                                     .fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_gbr_cat_enc.best_params_
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 

Numeric and Encoded Categorical Features 
Gradient Boosting Regressor
Out[40]:
{'max_depth': 4, 'n_estimators': 318}
In [44]:
hide_code
print ('Bagging Regressor')
param_grid_br_cat_enc = {'n_estimators': [159, 318, 636]}
gridsearch_br_cat_enc = GridSearchCV(BaggingRegressor(), param_grid_br_cat_enc, n_jobs=5)\
                                    .fit(X_train_cat_enc, y_train_cat_enc)
gridsearch_br_cat_enc.best_params_
Bagging Regressor
Out[44]:
{'n_estimators': 159}

Fit the Regressors

In [35]:
hide_code
print(separator, '\nNumeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(max_depth=4, n_estimators=360), 
                                     X_train, X_test, y_train)

y_train_br, y_test_br = regression(BaggingRegressor(n_estimators=360), 
                                   X_train, X_test, y_train)

scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 GradientBoostingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.86189746402
EV score. Test:  0.72137950021
----------
R2 score. Train:  0.86189746402
R2 score. Test:  0.721288744262
----------
MSE score. Train:  0.251150449123
MSE score. Test:  0.557674434564
----------
MAE score. Train:  0.31458911313
MAE score. Test:  0.400674681789
----------
MdAE score. Train:  0.174402117839
MdAE score. Test:  0.197971759701
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 BaggingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.955627109863
EV score. Test:  0.71803758233
----------
R2 score. Train:  0.955605318842
R2 score. Test:  0.717703884253
----------
MSE score. Train:  0.0807352597289
MSE score. Test:  0.564847394886
----------
MAE score. Train:  0.147313571728
MAE score. Test:  0.392232842331
----------
MdAE score. Train:  0.0637926350309
MdAE score. Test:  0.178704680195
In [36]:
hide_code
print(separator, '\nNumeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=396), 
           X_train_cat, X_test_cat, y_train_cat)

y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(n_estimators=308), X_train_cat, X_test_cat, y_train_cat)

scores('GradientBoostingRegressor', 
       y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor', 
       y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 GradientBoostingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.819256487057
EV score. Test:  0.717198284173
----------
R2 score. Train:  0.819256487057
R2 score. Test:  0.717120072882
----------
MSE score. Train:  0.328696458248
MSE score. Test:  0.566015545327
----------
MAE score. Train:  0.352419590753
MAE score. Test:  0.407314797978
----------
MdAE score. Train:  0.190394737254
MdAE score. Test:  0.204378587242
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 BaggingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.955791223091
EV score. Test:  0.715612108446
----------
R2 score. Train:  0.955760452896
R2 score. Test:  0.715177071939
----------
MSE score. Train:  0.0804531361087
MSE score. Test:  0.569903303464
----------
MAE score. Train:  0.147071970282
MAE score. Test:  0.394235991547
----------
MdAE score. Train:  0.0636800742544
MdAE score. Test:  0.181725523088
In [37]:
hide_code
print(separator, '\nNumeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(max_depth=4, n_estimators=318), 
           X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)

y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(n_estimators=159), 
           X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)

scores('GradientBoostingRegressor', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Encoded Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 GradientBoostingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.845218755194
EV score. Test:  0.709068549988
----------
R2 score. Train:  0.845218755194
R2 score. Test:  0.708937700181
----------
MSE score. Train:  0.281482008082
MSE score. Test:  0.582387686656
----------
MAE score. Train:  0.330491575879
MAE score. Test:  0.404205001441
----------
MdAE score. Train:  0.180651597294
MdAE score. Test:  0.199672380395
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 BaggingRegressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.954978555519
EV score. Test:  0.714800235444
----------
R2 score. Train:  0.95495121038
R2 score. Test:  0.714391146873
----------
MSE score. Train:  0.0819248080074
MSE score. Test:  0.571475863981
----------
MAE score. Train:  0.147949895276
MAE score. Test:  0.394313992554
----------
MdAE score. Train:  0.0644131062194
MdAE score. Test:  0.180543970199

MLP Regressors

In [155]:
hide_code
mlpr = MLPRegressor(hidden_layer_sizes=(324,), max_iter=200, 
                    solver='lbfgs', alpha=0.01)
mlpr.fit(X_train, y_train)

y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)

print(separator, '\nNumeric Features')
scores('MLP Regressor', y_train, y_test, y_train_mlpr, y_test_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.70262853816
EV score. Test:  0.687383005771
----------
R2 score. Train:  0.702627541531
R2 score. Test:  0.687296586598
----------
MSE score. Train:  0.540795474691
MSE score. Test:  0.625689474913
----------
MAE score. Train:  0.420978142615
MAE score. Test:  0.437882466234
----------
MdAE score. Train:  0.233990388649
MdAE score. Test:  0.239899428994
In [156]:
hide_code
mlpr_cat = MLPRegressor(hidden_layer_sizes=(396,), max_iter=200, 
                        solver='lbfgs', alpha=0.01)
mlpr_cat.fit(X_train_cat, y_train_cat)

y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)

print(separator, '\nNumeric and Categorical Features')
scores('MLP Regressor', y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.721716305379
EV score. Test:  0.685607658089
----------
R2 score. Train:  0.721697977088
R2 score. Test:  0.685546588486
----------
MSE score. Train:  0.506114370385
MSE score. Test:  0.629191052936
----------
MAE score. Train:  0.410047655345
MAE score. Test:  0.439993670296
----------
MdAE score. Train:  0.223439022222
MdAE score. Test:  0.239014246519
In [157]:
hide_code
mlpr_cat_enc = MLPRegressor(hidden_layer_sizes=(318,), max_iter=200, 
                            solver='lbfgs', alpha=0.01)
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)

y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)

print(separator, '\nNumeric and Encoded Categorical Features')
scores('MLP Regressor', y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Encoded Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.754017252062
EV score. Test:  0.696561834111
----------
R2 score. Train:  0.754016034965
R2 score. Test:  0.696381826873
----------
MSE score. Train:  0.447341410909
MSE score. Test:  0.607510782346
----------
MAE score. Train:  0.395354888752
MAE score. Test:  0.440226832127
----------
MdAE score. Train:  0.210840433384
MdAE score. Test:  0.233123393999

Display Predictions

In [41]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')

plt.plot(y_test_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_br[1:50], label='Bagging Regressor')
plt.plot(y_test_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric Features; Regressor Predictions vs Real Data");
In [42]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric and Categorical Features; Regressor Predictions vs Real Data");
In [43]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_enc_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Regressor Predictions vs Real Data");

Neural Networks; Keras

MLP

In [139]:
hide_code
def mlp_model():
    model = Sequential()

    model.add(Dense(108, activation='relu', input_dim=36))
    model.add(Dense(108, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(256, activation='relu'))
    model.add(Dense(256, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
        
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
    return model

mlp_model = mlp_model()
mlp_checkpointer = ModelCheckpoint(filepath='weights.best.mlp.hdf5', 
                                   verbose=2, save_best_only=True)
mlp_history = mlp_model.fit(X_train, y_train, 
                            validation_data=(X_test, y_test),
                            nb_epoch=15, batch_size=128, verbose=0, 
                            callbacks=[mlp_checkpointer])
Epoch 00000: val_loss improved from inf to 0.97929, saving model to weights.best.mlp.hdf5
Epoch 00001: val_loss improved from 0.97929 to 0.66325, saving model to weights.best.mlp.hdf5
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss improved from 0.66325 to 0.65604, saving model to weights.best.mlp.hdf5
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
In [140]:
hide_code
history_plot(mlp_history)
In [141]:
hide_code
mlp_model.load_weights('weights.best.mlp.hdf5')

y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)

mlp_model.save('mlp_model_p6.h5')

print(separator, '\nNumeric Features')
scores('MLP Model', y_train, y_test, y_train_mlp, y_test_mlp)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.716729617902
EV score. Test:  0.675551464035
----------
R2 score. Train:  0.712009070283
R2 score. Test:  0.672125965528
----------
MSE score. Train:  0.523734418261
MSE score. Test:  0.65604442956
----------
MAE score. Train:  0.415878808027
MAE score. Test:  0.437741741432
----------
MdAE score. Train:  0.21520614571
MdAE score. Test:  0.225855865118
In [142]:
hide_code
def mlp_cat_model():
    model = Sequential()
    
    model.add(Dense(66, activation='relu', input_dim=44))
    model.add(Dense(66, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(256, activation='relu'))
    model.add(Dense(256, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
    
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
    return model

mlp_cat_model = mlp_cat_model()
mlp_cat_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat.hdf5', 
                                       verbose=2, save_best_only=True)
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat, 
                                    validation_data=(X_test_cat, y_test_cat),
                                    nb_epoch=15, batch_size=128, verbose=0, 
                                    callbacks=[mlp_cat_checkpointer])
Epoch 00000: val_loss improved from inf to 1.03388, saving model to weights.best.mlp_cat.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss improved from 1.03388 to 0.70023, saving model to weights.best.mlp_cat.hdf5
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss improved from 0.70023 to 0.63066, saving model to weights.best.mlp_cat.hdf5
Epoch 00006: val_loss improved from 0.63066 to 0.61570, saving model to weights.best.mlp_cat.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
In [143]:
hide_code
history_plot(mlp_cat_history)
In [144]:
hide_code
mlp_cat_model.load_weights('weights.best.mlp_cat.hdf5')

y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)

mlp_cat_model.save('mlp_cat_model_p6.h5')

print(separator, '\nNumeric and Categorical Features')
scores('MLP Model', 
       y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.732511318066
EV score. Test:  0.692704785781
----------
R2 score. Train:  0.732316762151
R2 score. Test:  0.692288413973
----------
MSE score. Train:  0.486803264918
MSE score. Test:  0.615701308123
----------
MAE score. Train:  0.41593801753
MAE score. Test:  0.443157997067
----------
MdAE score. Train:  0.216797930797
MdAE score. Test:  0.222151017763
In [145]:
hide_code
def mlp_cat_enc_model():
    model = Sequential()
    
    model.add(Dense(159, activation='relu', input_dim=636))
    model.add(Dense(159, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(318, activation='relu'))
    model.add(Dense(318, activation='relu'))
    
    model.add(Dropout(0.1))
    
    model.add(Dense(636, activation='relu'))
    model.add(Dense(636, activation='relu'))
    
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model

mlp_cat_enc_model = mlp_cat_enc_model()
mlp_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_enc.hdf5', 
                                           verbose=2, save_best_only=True)
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc, 
                                            validation_data=(X_test_cat_enc, y_test_cat_enc),
                                            nb_epoch=15, batch_size=128, verbose=0, 
                                            callbacks=[mlp_cat_enc_checkpointer])
Epoch 00000: val_loss improved from inf to 0.76682, saving model to weights.best.mlp_cat_enc.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss improved from 0.76682 to 0.64222, saving model to weights.best.mlp_cat_enc.hdf5
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss improved from 0.64222 to 0.61162, saving model to weights.best.mlp_cat_enc.hdf5
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
In [146]:
hide_code
history_plot(mlp_cat_enc_history)
In [147]:
hide_code
mlp_cat_enc_model.load_weights('weights.best.mlp_cat_enc.hdf5')

y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)

mlp_cat_enc_model.save('mlp_cat_enc_model_p6.h5')

print(separator, '\nNumeric and Encoded Categorical Features')
scores('MLP Model', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Encoded Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.757303109193
EV score. Test:  0.695575217044
----------
R2 score. Train:  0.754730973643
R2 score. Test:  0.694325912658
----------
MSE score. Train:  0.446041238043
MSE score. Test:  0.611624469088
----------
MAE score. Train:  0.374280232041
MAE score. Test:  0.415592994681
----------
MdAE score. Train:  0.17744043151
MdAE score. Test:  0.193952638648

CNN

In [135]:
hide_code
def cnn_model():
    model = Sequential()
        
    model.add(Conv1D(36, 5, padding='valid', activation='relu', input_shape=(36, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(144, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(576, activation='relu', kernel_initializer='normal',))
    model.add(Dropout(0.5))
    
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
    return model

cnn_model = cnn_model()
cnn_checkpointer = ModelCheckpoint(filepath='weights.best.cnn.hdf5', 
                                   verbose=2, save_best_only=True)
cnn_history = cnn_model.fit(X_train.reshape(-1, 36, 1), y_train, 
                            epochs=30, batch_size=128, verbose=0, callbacks=[cnn_checkpointer],
                            validation_data=(X_test.reshape(-1, 36, 1), y_test))
Epoch 00000: val_loss improved from inf to 0.81514, saving model to weights.best.cnn.hdf5
Epoch 00001: val_loss improved from 0.81514 to 0.76718, saving model to weights.best.cnn.hdf5
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss improved from 0.76718 to 0.71464, saving model to weights.best.cnn.hdf5
Epoch 00004: val_loss improved from 0.71464 to 0.69577, saving model to weights.best.cnn.hdf5
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.69577 to 0.65814, saving model to weights.best.cnn.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss improved from 0.65814 to 0.65460, saving model to weights.best.cnn.hdf5
Epoch 00009: val_loss improved from 0.65460 to 0.62589, saving model to weights.best.cnn.hdf5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss did not improve
Epoch 00017: val_loss improved from 0.62589 to 0.61794, saving model to weights.best.cnn.hdf5
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
Epoch 00020: val_loss did not improve
Epoch 00021: val_loss did not improve
Epoch 00022: val_loss did not improve
Epoch 00023: val_loss did not improve
Epoch 00024: val_loss did not improve
Epoch 00025: val_loss did not improve
Epoch 00026: val_loss did not improve
Epoch 00027: val_loss did not improve
Epoch 00028: val_loss did not improve
Epoch 00029: val_loss did not improve
In [136]:
hide_code
history_plot(cnn_history)
In [137]:
hide_code
cnn_model.load_weights('weights.best.cnn.hdf5')

y_train_cnn = cnn_model.predict(X_train.reshape(-1, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(-1, 36, 1))

cnn_model.save('cnn_model_p6.h5')

print(separator, '\nNumeric Features')
scores('CNN Model', y_train, y_test, y_train_cnn, y_test_cnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.696547881356
EV score. Test:  0.692188654739
----------
R2 score. Train:  0.69557579802
R2 score. Test:  0.691169000033
----------
MSE score. Train:  0.553619631301
MSE score. Test:  0.617941147824
----------
MAE score. Train:  0.437340642474
MAE score. Test:  0.458189248255
----------
MdAE score. Train:  0.23846250991
MdAE score. Test:  0.247536358975
In [138]:
hide_code
SVG(model_to_dot(cnn_model).create(prog='dot', format='svg'))
Out[138]:
G 6822401472 conv1d_35_input: InputLayer 6822400128 conv1d_35: Conv1D 6822401472->6822400128 6824290120 max_pooling1d_35: MaxPooling1D 6822400128->6824290120 6831146880 dropout_98: Dropout 6824290120->6831146880 6831146040 conv1d_36: Conv1D 6831146880->6831146040 6831345280 max_pooling1d_36: MaxPooling1D 6831146040->6831345280 6808474680 dropout_99: Dropout 6831345280->6808474680 6808474960 flatten_16: Flatten 6808474680->6808474960 6809071568 dense_191: Dense 6808474960->6809071568 5486401688 dropout_100: Dropout 6809071568->5486401688 5486402808 dense_192: Dense 5486401688->5486402808
In [133]:
hide_code
def cnn_cat_model():
    model = Sequential()
        
    model.add(Conv1D(44, 5, padding='valid', activation='relu', input_shape=(44, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(156, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(624, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
    return model

cnn_cat_model = cnn_cat_model()
cnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat.hdf5', 
                                       verbose=2, save_best_only=True)
cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(-1, 44, 1), y_train_cat, 
                                    epochs=30, batch_size=128, verbose=0, callbacks=[cnn_cat_checkpointer],
                                    validation_data=(X_test_cat.reshape(-1, 44, 1), y_test_cat))
Epoch 00000: val_loss improved from inf to 0.89444, saving model to weights.best.cnn_cat.hdf5
Epoch 00001: val_loss improved from 0.89444 to 0.75230, saving model to weights.best.cnn_cat.hdf5
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss improved from 0.75230 to 0.67255, saving model to weights.best.cnn_cat.hdf5
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.67255 to 0.63895, saving model to weights.best.cnn_cat.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss improved from 0.63895 to 0.63473, saving model to weights.best.cnn_cat.hdf5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss improved from 0.63473 to 0.62870, saving model to weights.best.cnn_cat.hdf5
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss improved from 0.62870 to 0.59221, saving model to weights.best.cnn_cat.hdf5
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss improved from 0.59221 to 0.58469, saving model to weights.best.cnn_cat.hdf5
Epoch 00017: val_loss did not improve
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
Epoch 00020: val_loss did not improve
Epoch 00021: val_loss did not improve
Epoch 00022: val_loss did not improve
Epoch 00023: val_loss did not improve
Epoch 00024: val_loss did not improve
Epoch 00025: val_loss did not improve
Epoch 00026: val_loss did not improve
Epoch 00027: val_loss did not improve
Epoch 00028: val_loss did not improve
Epoch 00029: val_loss did not improve
In [134]:
hide_code
history_plot(cnn_cat_history)
In [125]:
hide_code
cnn_cat_model.load_weights('weights.best.cnn_cat.hdf5')

y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(-1, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(-1, 44, 1))

cnn_cat_model.save('cnn_cat_model_p6.h5')

print(separator, '\nNumeric and Categorical Features')
scores('CNN Model', 
       y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.750772050101
EV score. Test:  0.703572108338
----------
R2 score. Train:  0.750702571788
R2 score. Test:  0.703490345929
----------
MSE score. Train:  0.453367207317
MSE score. Test:  0.593287318946
----------
MAE score. Train:  0.400844047897
MAE score. Test:  0.432183848017
----------
MdAE score. Train:  0.204027751396
MdAE score. Test:  0.208331465191
In [126]:
hide_code
def cnn_cat_enc_model():
    model = Sequential()
        
    model.add(Conv1D(159, 5, padding='valid', activation='relu', input_shape=(636, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(318, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(636, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
    return model

cnn_cat_enc_model = cnn_cat_enc_model()
cnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_enc.hdf5', 
                                           verbose=2, save_best_only=True)
cnn_cat_enc_history = \
cnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 636, 1), y_train_cat_enc, 
                      epochs=20, batch_size=128, verbose=0, callbacks=[cnn_cat_enc_checkpointer],
                      validation_data=(X_test_cat_enc.reshape(-1, 636, 1), y_test_cat_enc))
Epoch 00000: val_loss improved from inf to 0.85929, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00001: val_loss improved from 0.85929 to 0.81947, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00002: val_loss improved from 0.81947 to 0.67931, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss improved from 0.67931 to 0.64155, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00006: val_loss improved from 0.64155 to 0.64041, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss improved from 0.64041 to 0.63223, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss improved from 0.63223 to 0.62492, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss improved from 0.62492 to 0.61702, saving model to weights.best.cnn_cat_enc.hdf5
Epoch 00017: val_loss did not improve
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
In [127]:
hide_code
history_plot(cnn_cat_enc_history)
In [128]:
hide_code
cnn_cat_enc_model.load_weights('weights.best.cnn_cat_enc.hdf5')

y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 636, 1))

cnn_cat_enc_model.save('cnn_cat_enc_model_p6.h5')

print(separator, '\nNumeric and Encoded Categorical Features')
scores('CNN Model', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Encoded Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.759131838049
EV score. Test:  0.691962440001
----------
R2 score. Train:  0.758228668412
R2 score. Test:  0.69162908674
----------
MSE score. Train:  0.439680401829
MSE score. Test:  0.617020558546
----------
MAE score. Train:  0.406652647493
MAE score. Test:  0.447075096584
----------
MdAE score. Train:  0.227229965528
MdAE score. Test:  0.24142826928

RNN

In [152]:
hide_code
def rnn_model():
    model = Sequential()
    
    model.add(LSTM(144, return_sequences=True, input_shape=(1, 36)))
    model.add(LSTM(576, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model 

rnn_model = rnn_model()
rnn_checkpointer = ModelCheckpoint(filepath='weights.best.rnn.hdf5', 
                                   verbose=2, save_best_only=True)
rnn_history = rnn_model.fit(X_train.reshape(-1, 1, 36), y_train.reshape(-1), 
                            epochs=7, verbose=0, callbacks=[rnn_checkpointer],
                            validation_data=(X_test.reshape(-1, 1, 36), y_test.reshape(-1)))
Epoch 00000: val_loss improved from inf to 0.68238, saving model to weights.best.rnn.hdf5
Epoch 00001: val_loss improved from 0.68238 to 0.64374, saving model to weights.best.rnn.hdf5
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.64374 to 0.63747, saving model to weights.best.rnn.hdf5
In [153]:
hide_code
history_plot(rnn_history)
In [154]:
hide_code
rnn_model.load_weights('weights.best.rnn.hdf5')

y_train_rnn = rnn_model.predict(X_train.reshape(-1, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(-1, 1, 36))

rnn_model.save('rnn_model_p6.h5')

print(separator, '\nNumeric Features')
scores('RNN Model', y_train, y_test, y_train_rnn, y_test_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.682874181327
EV score. Test:  0.681416378767
----------
R2 score. Train:  0.682810505694
R2 score. Test:  0.681410437544
----------
MSE score. Train:  0.576834331005
MSE score. Test:  0.637467093428
----------
MAE score. Train:  0.440112160532
MAE score. Test:  0.456550351971
----------
MdAE score. Train:  0.22889533586
MdAE score. Test:  0.248062273713
In [158]:
hide_code
def rnn_cat_model():
    model = Sequential()
    
    model.add(LSTM(156, return_sequences=True, input_shape=(1, 44)))
    model.add(LSTM(624, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model 

rnn_cat_model = rnn_cat_model()
rnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat.hdf5', 
                                       verbose=2, save_best_only=True)
rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(-1, 1, 44), y_train_cat.reshape(-1), 
                                    epochs=10, verbose=0, callbacks=[rnn_cat_checkpointer],
                                    validation_data=(X_test_cat.reshape(-1, 1, 44), y_test_cat.reshape(-1)))
Epoch 00000: val_loss improved from inf to 0.67332, saving model to weights.best.rnn_cat.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss improved from 0.67332 to 0.62988, saving model to weights.best.rnn_cat.hdf5
Epoch 00004: val_loss improved from 0.62988 to 0.61291, saving model to weights.best.rnn_cat.hdf5
Epoch 00005: val_loss improved from 0.61291 to 0.60830, saving model to weights.best.rnn_cat.hdf5
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
In [159]:
hide_code
history_plot(rnn_cat_history)
In [160]:
hide_code
rnn_cat_model.load_weights('weights.best.rnn_cat.hdf5')

y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(-1, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(-1, 1, 44))

rnn_cat_model.save('rnn_cat_model_p6.h5')

print(separator, '\nNumeric and Categorical Features')
scores('RNN Model', 
       y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.703780922615
EV score. Test:  0.696926229658
----------
R2 score. Train:  0.703186185353
R2 score. Test:  0.695985951518
----------
MSE score. Train:  0.539779536455
MSE score. Test:  0.608302890882
----------
MAE score. Train:  0.425810817364
MAE score. Test:  0.438739487756
----------
MdAE score. Train:  0.227912049625
MdAE score. Test:  0.240482621193
In [161]:
hide_code
def rnn_cat_enc_model():
    model = Sequential()
    
    model.add(LSTM(159, return_sequences=True, input_shape=(1, 636)))
    model.add(LSTM(636, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model 

rnn_cat_enc_model = rnn_cat_enc_model()
rnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_enc.hdf5', 
                                           verbose=2, save_best_only=True)
rnn_cat_enc_history = \
rnn_cat_enc_model.fit(X_train_cat_enc.reshape(-1, 1, 636), y_train_cat_enc.reshape(-1), 
                      epochs=10, verbose=0, callbacks=[rnn_cat_enc_checkpointer],
                      validation_data=(X_test_cat_enc.reshape(-1, 1, 636), y_test_cat_enc.reshape(-1)))
Epoch 00000: val_loss improved from inf to 0.63510, saving model to weights.best.rnn_cat_enc.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss improved from 0.63510 to 0.59561, saving model to weights.best.rnn_cat_enc.hdf5
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
In [162]:
hide_code
history_plot(rnn_cat_enc_history)
In [163]:
hide_code
rnn_cat_enc_model.load_weights('weights.best.rnn_cat_enc.hdf5')

y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(-1, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(-1, 1, 636))

rnn_cat_enc_model.save('rnn_cat_enc_model_p6.h5')

print(separator, '\nNumeric and Encoded Categorical Features')
scores('RNN Model', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
Numeric and Encoded Categorical Features
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN Model 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score. Train:  0.716781030467
EV score. Test:  0.702329477206
----------
R2 score. Train:  0.716601134836
R2 score. Test:  0.702329404906
----------
MSE score. Train:  0.515383383525
MSE score. Test:  0.595610250349
----------
MAE score. Train:  0.400474366173
MAE score. Test:  0.420455708375
----------
MdAE score. Train:  0.200157138507
MdAE score. Test:  0.208026864515

Display Predictions

In [164]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test[1:50], color = 'black', label='Real Data')

plt.plot(y_test_mlp[1:50], label='MLP')
plt.plot(y_test_cnn[1:50], label='CNN')
plt.plot(y_test_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Neural Network Predictions vs Real Data");
In [165]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_mlp[1:50], label='MLP')
plt.plot(y_test_cat_cnn[1:50], label='CNN')
plt.plot(y_test_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Neural Network Predictions vs Real Data");
In [166]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(y_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(y_test_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Neural Network Predictions vs Real Data");

Evaluation Metrics and Predictions

  • explained variance regression score
  • coefficient of determination
  • mean squared error
  • mean absolute error
  • median absolute error
In [168]:
hide_code
target_scale = RobustScaler()
s_target_train = target_scale.fit_transform(target_train.reshape(-1,1))
######################################################################################
feature_scale = RobustScaler()
s_features_train = feature_scale.fit_transform(features_train)
s_features_test = feature_scale.transform(features_test)
######################################################################################
feature_cat_scale = RobustScaler()
s_features_train_cat = feature_cat_scale.fit_transform(features_train_cat)
s_features_test_cat = feature_cat_scale.transform(features_test_cat)
######################################################################################
feature_cat_enc_scale = RobustScaler()
s_features_train_cat_enc = feature_cat_enc_scale.fit_transform(features_train_cat_enc)
s_features_test_cat_enc = feature_cat_enc_scale.transform(features_test_cat_enc)

Regressors; Scikit-Learn

Numeric Features

In [159]:
hide_code
gbr = GradientBoostingRegressor(max_depth=4, n_estimators=360)
gbr.fit(s_features_train, s_target_train)
br = BaggingRegressor(n_estimators=360)
br.fit(s_features_train, s_target_train)

s_target_train_gbr = gbr.predict(s_features_train)
s_target_test_gbr = gbr.predict(s_features_test)
s_target_train_br = br.predict(s_features_train)
s_target_test_br = br.predict(s_features_test)
s_target_train_mlpr = mlpr.predict(s_features_train)
s_target_test_mlpr = mlpr.predict(s_features_test)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_br)
scores2('MLP Regressor', s_target_train, s_target_train_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Gradient Boosting Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.851729559483
----------
R2 score: 0.851729559483
----------
MSE score: 0.273663122104
----------
MAE score: 0.324355312761
----------
MdAE score: 0.17539487972
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Bagging Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.958673168561
----------
R2 score: 0.958649811062
----------
MSE score: 0.0763201469208
----------
MAE score: 0.144578783039
----------
MdAE score: 0.0628286831822
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.699354284618
----------
R2 score: 0.69932065661
----------
MSE score: 0.554964614506
----------
MAE score: 0.423522294048
----------
MdAE score: 0.234736923498

Numeric and Categorical Features

In [160]:
hide_code
gbr_cat = GradientBoostingRegressor(max_depth=3, n_estimators=396)
gbr_cat.fit(s_features_train_cat, s_target_train)
br_cat = BaggingRegressor(n_estimators=308)
br_cat.fit(s_features_train_cat, s_target_train)

s_target_train_cat_gbr = gbr_cat.predict(s_features_train_cat)
s_target_test_cat_gbr = gbr_cat.predict(s_features_test_cat)
s_target_train_cat_br = br_cat.predict(s_features_train_cat)
s_target_test_cat_br = br_cat.predict(s_features_test_cat)
s_target_train_cat_mlpr = mlpr_cat.predict(s_features_train_cat)
s_target_test_cat_mlpr = mlpr_cat.predict(s_features_test_cat)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_br)
scores2('MLP Regressor', s_target_train, s_target_train_cat_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Gradient Boosting Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.813227343634
----------
R2 score: 0.813227343634
----------
MSE score: 0.344726757987
----------
MAE score: 0.357167721423
----------
MdAE score: 0.190744564286
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Bagging Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.958401319901
----------
R2 score: 0.958376833249
----------
MSE score: 0.0768239827516
----------
MAE score: 0.144458842366
----------
MdAE score: 0.0621988196009
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.713929709415
----------
R2 score: 0.71392710523
----------
MSE score: 0.528005455834
----------
MAE score: 0.415068371286
----------
MdAE score: 0.226551595988

Numeric and Encoded Categorical Features

In [161]:
hide_code
gbr_cat_enc = GradientBoostingRegressor(max_depth=4, n_estimators=318)
gbr_cat_enc.fit(s_features_train_cat_enc, s_target_train)
br_cat_enc = BaggingRegressor(n_estimators=159)
br_cat_enc.fit(s_features_train_cat_enc, s_target_train)

s_target_train_cat_enc_gbr = gbr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_gbr = gbr_cat_enc.predict(s_features_test_cat_enc)
s_target_train_cat_enc_br = br_cat.predict(s_features_train_cat_enc)
s_target_test_cat_enc_br = br_cat.predict(s_features_test_cat_enc)
s_target_train_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_test_cat_enc)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_enc_gbr)
scores2('Bagging Regressor', s_target_train, s_target_train_cat_enc_br)
scores2('MLP Regressor', s_target_train, s_target_train_cat_enc_mlpr)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Gradient Boosting Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.835548897222
----------
R2 score: 0.835548897222
----------
MSE score: 0.303527810822
----------
MAE score: 0.339775897345
----------
MdAE score: 0.183338754074
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 Bagging Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.919889093965
----------
R2 score: 0.919788326366
----------
MSE score: 0.148046886213
----------
MAE score: 0.201599986829
----------
MdAE score: 0.0919321248199
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP Regressor 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.741545391123
----------
R2 score: 0.74151850405
----------
MSE score: 0.477079942171
----------
MAE score: 0.403509095175
----------
MdAE score: 0.214798934835

Neural Networks; Keras

Numeric Features

In [169]:
hide_code
s_target_train_mlp = mlp_model.predict(s_features_train)
s_target_test_mlp = mlp_model.predict(s_features_test)
s_target_train_cnn = cnn_model.predict(s_features_train.reshape(-1, 36, 1))
s_target_test_cnn = cnn_model.predict(s_features_test.reshape(-1, 36, 1))
s_target_train_rnn = rnn_model.predict(s_features_train.reshape(-1, 1, 36))
s_target_test_rnn = rnn_model.predict(s_features_test.reshape(-1, 1, 36))

scores2('MLP', s_target_train, s_target_train_mlp)
scores2('CNN', s_target_train, s_target_train_cnn)
scores2('RNN', s_target_train, s_target_train_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.70801498175
----------
R2 score: 0.703616032067
----------
MSE score: 0.547036629305
----------
MAE score: 0.41892950256
----------
MdAE score: 0.215972725006
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.695716399825
----------
R2 score: 0.694795067187
----------
MSE score: 0.563317506198
----------
MAE score: 0.440605246339
----------
MdAE score: 0.240294961817
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.682438069906
----------
R2 score: 0.682417002303
----------
MSE score: 0.586163731448
----------
MAE score: 0.442514572875
----------
MdAE score: 0.232217242446

Numeric and Categorical Features

In [170]:
hide_code
s_target_train_cat_mlp = mlp_cat_model.predict(s_features_train_cat)
s_target_test_cat_mlp = mlp_cat_model.predict(s_features_test_cat)
s_target_train_cat_cnn = cnn_cat_model.predict(s_features_train_cat.reshape(-1, 44, 1))
s_target_test_cat_cnn = cnn_cat_model.predict(s_features_test_cat.reshape(-1, 44, 1))
s_target_train_cat_rnn = rnn_cat_model.predict(s_features_train_cat.reshape(-1, 1, 44))
s_target_test_cat_rnn = rnn_cat_model.predict(s_features_test_cat.reshape(-1, 1, 44))

scores2('MLP', s_target_train, s_target_train_cat_mlp)
scores2('CNN', s_target_train, s_target_train_cat_cnn)
scores2('RNN', s_target_train, s_target_train_cat_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.723838716688
----------
R2 score: 0.723563406127
----------
MSE score: 0.510219711219
----------
MAE score: 0.420804252981
----------
MdAE score: 0.218250214302
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.729185562536
----------
R2 score: 0.727091299322
----------
MSE score: 0.503708269945
----------
MAE score: 0.413977381429
----------
MdAE score: 0.213753531249
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.702369817951
----------
R2 score: 0.701672701335
----------
MSE score: 0.550623439686
----------
MAE score: 0.427606218429
----------
MdAE score: 0.230481679502

Numeric and Encoded Categorical Features

In [171]:
hide_code
s_target_train_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_test_cat_enc)
s_target_train_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 636, 1))
s_target_test_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 636, 1))
s_target_train_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(-1, 1, 636))
s_target_test_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(-1, 1, 636))

scores2('MLP', s_target_train, s_target_train_cat_enc_mlp)
scores2('CNN', s_target_train, s_target_train_cat_enc_cnn)
scores2('RNN', s_target_train, s_target_train_cat_enc_rnn)
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 MLP 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.744195527126
----------
R2 score: 0.741929632765
----------
MSE score: 0.476321120877
----------
MAE score: 0.381384959997
----------
MdAE score: 0.180834282854
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 CNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.744802961394
----------
R2 score: 0.744015991498
----------
MSE score: 0.472470323356
----------
MAE score: 0.41345397374
----------
MdAE score: 0.229116513316
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_> 
 RNN 
<_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_><_>
EV score: 0.713772527422
----------
R2 score: 0.713687300105
----------
MSE score: 0.528448064751
----------
MAE score: 0.403821263767
----------
MdAE score: 0.202283175896

Display All Predictions

In [165]:
hide_code
# Rescale Predictions
target_train_gbr = target_scale.inverse_transform(s_target_train_gbr.reshape(-1,1))
target_test_gbr = target_scale.inverse_transform(s_target_test_gbr.reshape(-1,1))
target_train_br = target_scale.inverse_transform(s_target_train_br.reshape(-1,1))
target_test_br = target_scale.inverse_transform(s_target_test_br.reshape(-1,1))
target_train_mlpr = target_scale.inverse_transform(s_target_train_mlpr.reshape(-1,1))
target_test_mlpr = target_scale.inverse_transform(s_target_test_mlpr.reshape(-1,1))

target_train_mlp = target_scale.inverse_transform(s_target_train_mlp)
target_test_mlp = target_scale.inverse_transform(s_target_test_mlp)
target_train_cnn = target_scale.inverse_transform(s_target_train_cnn)
target_test_cnn = target_scale.inverse_transform(s_target_test_cnn)
target_train_rnn = target_scale.inverse_transform(s_target_train_rnn)
target_test_rnn = target_scale.inverse_transform(s_target_test_rnn)
In [166]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_br[1:50], label='Bagging Regressor')
plt.plot(target_train_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_mlp[1:50], label='MLP')
plt.plot(target_train_cnn[1:50], label='CNN')
plt.plot(target_train_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Train Predictions vs Real Data");
In [167]:
hide_code
plt.figure(figsize = (18, 6))

plt.plot(target_test_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_br[1:50], label='Bagging Regressor')
plt.plot(target_test_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_mlp[1:50], label='MLP')
plt.plot(target_test_cnn[1:50], label='CNN')
plt.plot(target_test_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Test Predictions");
In [168]:
hide_code
# Rescale Predictions
target_train_cat_gbr = target_scale.inverse_transform(s_target_train_cat_gbr.reshape(-1,1))
target_test_cat_gbr = target_scale.inverse_transform(s_target_test_cat_gbr.reshape(-1,1))
target_train_cat_br = target_scale.inverse_transform(s_target_train_cat_br.reshape(-1,1))
target_test_cat_br = target_scale.inverse_transform(s_target_test_cat_br.reshape(-1,1))
target_train_cat_mlpr = target_scale.inverse_transform(s_target_train_cat_mlpr.reshape(-1,1))
target_test_cat_mlpr = target_scale.inverse_transform(s_target_test_cat_mlpr.reshape(-1,1))

target_train_cat_mlp = target_scale.inverse_transform(s_target_train_cat_mlp.reshape(-1,1))
target_test_cat_mlp = target_scale.inverse_transform(s_target_test_cat_mlp.reshape(-1,1))
target_train_cat_cnn = target_scale.inverse_transform(s_target_train_cat_cnn.reshape(-1,1))
target_test_cat_cnn = target_scale.inverse_transform(s_target_test_cat_cnn.reshape(-1,1))
target_train_cat_rnn = target_scale.inverse_transform(s_target_train_cat_rnn.reshape(-1,1))
target_test_cat_rnn = target_scale.inverse_transform(s_target_test_cat_rnn.reshape(-1,1))
In [169]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_cat_mlp[1:50], label='MLP')
plt.plot(target_train_cat_cnn[1:50], label='CNN')
plt.plot(target_train_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Train Predictions vs Real Data");
In [170]:
hide_code
plt.figure(figsize = (18, 6))

plt.plot(target_test_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_cat_mlp[1:50], label='MLP')
plt.plot(target_test_cat_cnn[1:50], label='CNN')
plt.plot(target_test_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Test Predictions");
In [171]:
hide_code
# Rescale Predictions
target_train_cat_enc_gbr = target_scale.inverse_transform(s_target_train_cat_enc_gbr.reshape(-1,1))
target_test_cat_enc_gbr = target_scale.inverse_transform(s_target_test_cat_enc_gbr.reshape(-1,1))
target_train_cat_enc_br = target_scale.inverse_transform(s_target_train_cat_enc_br.reshape(-1,1))
target_test_cat_enc_br = target_scale.inverse_transform(s_target_test_cat_enc_br.reshape(-1,1))
target_train_cat_enc_mlpr = target_scale.inverse_transform(s_target_train_cat_enc_mlpr.reshape(-1,1))
target_test_cat_enc_mlpr = target_scale.inverse_transform(s_target_test_cat_enc_mlpr.reshape(-1,1))

target_train_cat_enc_mlp = target_scale.inverse_transform(s_target_train_cat_enc_mlp.reshape(-1,1))
target_test_cat_enc_mlp = target_scale.inverse_transform(s_target_test_cat_enc_mlp.reshape(-1,1))
target_train_cat_enc_cnn = target_scale.inverse_transform(s_target_train_cat_enc_cnn.reshape(-1,1))
target_test_cat_enc_cnn = target_scale.inverse_transform(s_target_test_cat_enc_cnn.reshape(-1,1))
target_train_cat_enc_rnn = target_scale.inverse_transform(s_target_train_cat_enc_rnn.reshape(-1,1))
target_test_cat_enc_rnn = target_scale.inverse_transform(s_target_test_cat_enc_rnn.reshape(-1,1))
In [172]:
hide_code
plt.figure(figsize = (18, 6))
plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_train_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_train_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Train Predictions vs Real Data");
In [173]:
hide_code
plt.figure(figsize = (18, 6))

plt.plot(target_test_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_test_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Test Predictions");

Project Design

The project was built on the basis of the competition offered on the site https://www.kaggle.com.

The competition version of this notebook is avalible here: https://www.kaggle.com/olgabelitskaya/sberbank-russian-housing-market .

There are several popular resources (numpy, pandas, matplotlib, scikit-learn and keras) for regression models were used.

The most valuable in this project is the study of real data and the attempt to approximate the predictions on them to the threshold of 70-80 percent.